from paddlemix.datacopilot.core import MMDataset
from paddlemix.datacopilot.ops.filter._maximum_line_length_filter import maximum_line_length_filter

# Annotation file to read. The filtered copy is exported under a name derived
# from it by replacing the ".json" text with a tagged suffix.
anno_path = 'random_samples.json'
output_path = anno_path.replace('.json', '_max_line_filtered.json')

# Keyword arguments forwarded to the filter.
# NOTE(review): presumably a sample is kept only when the length of its
# longest line falls within these bounds -- confirm against the operator.
length_bounds = {
    "min_length": 10,
    "max_length": 128,
}

# Read the annotations and report how many samples there are before filtering.
print("Loading the dataset...")
dataset = MMDataset.from_json(anno_path)
print("Initial dataset size:", len(dataset))

# Run the filter and rebind `dataset` to its result.
# NOTE(review): this method is presumably attached to MMDataset by the
# `_maximum_line_length_filter` import at the top of the file -- verify
# before removing that import.
dataset = dataset.maximum_line_length_filter(**length_bounds)

# Report the size after filtering.
print("Filtered dataset size:", len(dataset))
print("Maximum line length filtering complete.")

# Write the filtered dataset to the derived output path.
dataset.export_json(output_path)